'''自制的网页爬虫,类似wget,webzip的用法.
  爬虫获取的文本类网页将被保存到指定文件夹,
  而资源类则保存到该文件夹下的resource文件夹下.
  所有文件被命名为原根目录之后的/换成下划线.
  网页都在跟文件夹,资源都在resource文件夹,是比较简单的爬虫
  
  如果需要过滤,则继承本类并重写_filter函数.
  
  除了初始页面爬虫,还可以进行页面补充,补充的页面只爬一层,
  用于初始页无法找到的页面.想更复杂还是得重载.
  
  目前没有对付iframe的代码,暂时放过他吧...
  貌似可以,用了几次没什么毛病.小站点资料能用
  by setycyas @2024-05-07
  '''

import os
from urllib.parse import urlparse
import re

from bs4 import BeautifulSoup
import requests

import UrlHandler

class MyWebGet:
  """Simple recursive web crawler (wget/webzip style).

  Crawled HTML pages are written into ``outputDir``; referenced resources
  (images, css) go into ``outputDir/resource``.  File names are derived
  from the URL by ``UrlHandler.url2filename``.  Subclass and override
  ``_filter()`` for custom URL filtering.
  """

  def __init__(self, topUrl, outputDir, level = 3, headers = None, encoding = 'utf-8'):
    """Initialize the crawler.

    topUrl    -- start page of the crawl
    outputDir -- directory where pages and resources are written
    level     -- maximum recursion depth (pages at this depth are skipped)
    headers   -- optional dict of HTTP headers sent with every request
    encoding  -- character encoding used to decode fetched pages
    """
    ## copy constructor arguments
    self._topUrl = topUrl
    self._outputDir = outputDir
    self._level = level
    # `is None` (identity test) instead of `== None`
    self._headers = {} if headers is None else headers
    self._encoding = encoding

    ## internal state
    self.urlHandler = UrlHandler.UrlHandler(topUrl)
    self._htmlRecord = {}  # pages already analysed: {url: depth at analysis time}
    self._resource = {}    # resources queued for download: {url: target file name}
    self._session = requests.Session()  # shared HTTP session (keep-alive, cookies)
    self._extra = []       # extra seed pages, crawled one level deep only

  def setExtra(self, extra: list):
    """Set the list of extra seed pages (crawled without deep recursion)."""
    self._extra = extra

  def _filter(self, url):
    """Return True when *url* should be crawled, False to skip it.

    Default policy: reject URLs containing '#' or '..', reject links that
    end in a common image extension (images are handled by _getResource),
    and accept only URLs under the top URL's directory.  Override in a
    subclass for stricter filtering.
    """
    if ('#' in url) or ('..' in url):
      return False
    # endswith accepts a tuple of suffixes: one call instead of four
    if url.endswith(('.jpg', '.png', '.gif', '.bmp')):
      return False
    return self.urlHandler._path in url

  def _handleUrl(self, url, level, isDebug = True):
    """Fetch and analyse one page at recursion depth *level* (0-based).

    Pages at or beyond the configured maximum depth are skipped.  A page
    already seen at a shallower-or-equal depth is skipped entirely; one
    previously seen only at a deeper depth is re-analysed (so its links
    get the larger depth budget) but not downloaded again.

    When isDebug is True nothing is written to disk, only reported.
    """
    downFlag = True  # whether this page itself still needs downloading
    ## depth budget exhausted: skip
    if level >= self._level:
      return
    ## already downloaded: maybe re-analyse, never re-download
    if url in self._htmlRecord:
      downFlag = False
      if self._htmlRecord[url] <= level:
        return  # already analysed at least this shallow; nothing new to find
    ## fetch and parse the page
    try:
      resp = self._session.get(url=url, headers=self._headers)
      text = resp.content.decode(self._encoding)
      soup = BeautifulSoup(text, 'lxml')
    except Exception:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
      print("获取分析"+url+"失败!")
      return
    # collect the links to recurse into
    links = set()
    for link in soup.find_all('a'):
      if 'href' not in link.attrs:
        continue
      href = self.urlHandler.getFullUrl(link.attrs['href'], url)
      if self._filter(href):
        links.add(href)
    if isDebug:
      print('debug - links:')
      print(links)
    # queue images and stylesheets for later download
    self._getResource(soup, 'img', url)
    self._getResource(soup, 'link', url)
    # record (or improve) the depth at which this page was analysed
    self._htmlRecord[url] = level
    # os.path.join instead of hand-built '\\' paths: portable across OSes
    filePath = os.path.join(self._outputDir, self.urlHandler.url2filename(url))
    if downFlag:
      if isDebug:
        print("debug下载页面:"+url+"\n为:"+filePath)
      else:
        self._soupModify(soup, url)
        with open(filePath, 'wb') as f:
          f.write(soup.prettify().encode('utf-8'))
        print("下载页面成功:"+url)
    # recurse into the collected links, one level deeper
    for link in links:
      self._handleUrl(link, level+1, isDebug)

  def _getResource(self, soup, name, url):
    """Queue the resources tagged *name* found in *soup* (fetched from *url*).

    name is 'img' or 'link' (link meaning a text/css stylesheet); other
    tag names are ignored -- extend by overriding.  Each resolved resource
    URL is added to self._resource mapped to its on-disk file name.
    """
    for target in soup.find_all(name):
      # normalise both tag kinds to a single `src` URL, skipping malformed tags
      if name == 'img':
        if 'src' not in target.attrs:
          continue
        src = target.attrs['src']
      elif name == 'link':
        # only stylesheets: require type="text/css" and an href
        if target.attrs.get('type') != r'text/css':
          continue
        if 'href' not in target.attrs:
          continue
        src = target.attrs['href']
      else:
        continue  # unknown tag name: previously left `src` unbound -> NameError
      src = self.urlHandler.getFullUrl(src, url)
      self._resource[src] = self.urlHandler.url2filename(src)

  def _downResource(self, isDebug=True):
    """Download every queued resource; call once at the end of the crawl."""
    if isDebug:
      print('debug - resource:')
      print(self._resource)
    for src in self._resource:
      filePath = os.path.join(self._outputDir, 'resource', self._resource[src])
      if isDebug:
        print("debug下载资源:"+src+"\n为:"+filePath)
        continue
      # resources are immutable enough: never re-download an existing file
      if os.path.exists(filePath):
        print("资源:"+src+"已存在")
        continue
      try:
        resp = self._session.get(url=src, headers=self._headers)
        with open(filePath, 'wb') as f:
          f.write(resp.content)
        print("下载资源成功:"+src)
      except Exception:  # narrowed from a bare except
        print("下载资源失败:"+src)
        continue

  def _soupModify(self, soup, parentUrl):
    """Rewrite links in *soup* to point at the files as saved on disk.

    Resource links use a forward slash ('resource/...') because backslash
    is not a valid URL path separator in HTML, even for local files.
    """
    ## img -> resource folder
    for img in soup.find_all('img'):
      if 'src' not in img.attrs:
        continue
      fn = self.urlHandler.url2filename(img.attrs['src'], parentUrl)
      img.attrs['src'] = 'resource/' + fn
    ## css -> resource folder
    for style in soup.find_all('link'):
      if 'href' not in style.attrs:
        continue
      fn = self.urlHandler.url2filename(style.attrs['href'], parentUrl)
      style.attrs['href'] = 'resource/' + fn
    ## a -> sibling page file
    for link in soup.find_all('a'):
      if 'href' not in link.attrs:
        continue
      if '#' in link.attrs['href']:
        continue  # in-page anchors were filtered out at crawl time anyway
      link.attrs['href'] = self.urlHandler.url2filename(link.attrs['href'], parentUrl)

  def main(self, isDebug = True):
    """Run the crawl.  When isDebug is True nothing is written to disk."""
    os.makedirs(self._outputDir, exist_ok=True)
    os.makedirs(os.path.join(self._outputDir, 'resource'), exist_ok=True)
    self._handleUrl(self._topUrl, 0, isDebug)
    # extra seed pages start at the last allowed depth: analysed one level only
    for url in self._extra:
      self._handleUrl(url, self._level-1, isDebug)
    self._downResource(isDebug)

## script entry point
if __name__ == '__main__':
  start_page = r'https://langrisser.org/l1/MD/index.htm'
  save_dir = r'F:\Download\langrisser_org\md1_newtest1'
  max_depth = 4
  # pretend to be a normal browser visit coming from the site itself
  request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': start_page,
  }

  crawler = MyWebGet(start_page, save_dir, max_depth, request_headers, 'gbk')
  run_debug = False  # False: actually download, True: dry run
  crawler.main(run_debug)
    